from util import JS_div
import pickle
from gensim import corpora, models
import os
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import random
import sys

# Number of training passes handed to the LDA trainer as ``passes``.
passes_number = 45
# Number of LDA topics; also the length of every vector built by list_trans.
topic_number = 26


def list_trans(input_list):
	"""Expand sparse ``(index, value)`` pairs into a dense, sum-normalised vector.

	Args:
		input_list: iterable of indexable pairs whose first item is a position
			in ``range(topic_number)`` and whose second item is the value to
			store there. If a position occurs more than once, the last value
			wins. Positions that never occur stay at 0.

	Returns:
		A numpy array of length ``topic_number`` holding the stored values
		divided by their sum.
	"""
	dense = [0 for _ in range(topic_number)]
	for pair in input_list:
		position, value = pair[0], pair[1]
		dense[position] = value
	# Built from a plain list (not np.zeros) so numpy infers the dtype from
	# the stored values themselves.
	total = np.sum(dense)
	return np.array(dense) / total

def load_pickle(path):
	"""Return the object unpickled from ``path``; the file is closed on exit.

	NOTE: unpickling can execute arbitrary code, so ``path`` must point to a
	trusted file.
	"""
	with open(path, 'rb') as handle:
		return pickle.load(handle)


def dump_pickle(obj, path):
	"""Pickle ``obj`` to ``path``; the file is flushed and closed on exit."""
	with open(path, 'wb') as handle:
		pickle.dump(obj, handle)


def account_tokens(twitter_list):
	"""Return the whitespace-separated tokens of all non-empty tweets, in order.

	Args:
		twitter_list: iterable of tweet strings; empty strings are skipped.

	Returns:
		A flat list of tokens (empty when there is nothing to tokenise).
	"""
	return ' '.join(tweet for tweet in twitter_list if len(tweet) > 0).split()


def build_topic_matrix(tweet_dict, lda_model, dictionary, skip_failures):
	"""Infer one normalised topic vector per account.

	All tweets of an account are merged into a single document, converted to
	a bag-of-words with ``dictionary`` and passed through ``lda_model``; the
	resulting sparse topic list is densified by ``list_trans``.

	Args:
		tweet_dict: mapping of account name -> list of tweet strings.
		lda_model: trained model indexable by a bag-of-words document.
		dictionary: object providing ``doc2bow(tokens)``.
		skip_failures: when true, an account whose processing raises is
			printed to stdout and left out of the result; when false the
			exception propagates.

	Returns:
		``(matrix, names)``: the list of topic vectors and the parallel list
		of account names, in the iteration order of ``tweet_dict``.
	"""
	matrix = []
	names = []
	for each_name, twitter_list in tweet_dict.items():
		try:
			bow = dictionary.doc2bow(account_tokens(twitter_list))
			topic_vector = list_trans(lda_model[bow])
		except Exception as err:
			# ``Exception`` rather than a bare ``except`` so that Ctrl-C and
			# SystemExit still stop the run.
			if not skip_failures:
				raise
			print(each_name)
			# Keep the cause visible without changing what goes to stdout.
			print('skipped %r: %r' % (each_name, err), file=sys.stderr)
			continue
		matrix.append(topic_vector)
		names.append(each_name)
	return matrix, names


def run_batch(int_batch_number):
	"""Train the LDA model for one batch and write its per-account topic vectors.

	Reads the four sampled tweet dictionaries of the batch, trains an LDA
	model on the congress, mayor and governor tweets (one tweet per
	document; the soccer tweets are not part of the training corpus),
	pickles the model and its dictionary, and finally pickles a topic matrix
	and a name list for each of the four groups.

	Args:
		int_batch_number: integer batch index; selects the batch folder and
			file suffix, and seeds the LDA ``random_state``.
	"""
	batch_number = str(int_batch_number)
	tmp_folder = '../data/tmp_files/new_topic_number_26/boosting_files/all_four/' + batch_number + '/'

	congress_tweet_dict = load_pickle(tmp_folder + 'congress_tweet_dict_filter_4th_sample_no_timelimit_' + batch_number)
	mayor_tweet_dict = load_pickle(tmp_folder + 'mayor_tweet_dict_filter_4th_sample_no_timelimit_' + batch_number)
	governor_tweet_dict = load_pickle(tmp_folder + 'governor_tweet_dict_filter_4th_sample_no_timelimit_' + batch_number)
	soccer_tweet_dict = load_pickle(tmp_folder + 'soccer_tweet_dict_filter_4th_sample_no_timelimit_' + batch_number)

	# Training corpus: every individual tweet of the three political groups.
	totalArticle = []
	for training_dict in (congress_tweet_dict, mayor_tweet_dict, governor_tweet_dict):
		for twitter_list in training_dict.values():
			totalArticle.extend(twitter_list)
	totalArticle_token = [eacharticle.split() for eacharticle in totalArticle]

	dictionary = corpora.Dictionary(totalArticle_token)
	corpus = [dictionary.doc2bow(article) for article in totalArticle_token]
	LDA_model = models.ldamulticore.LdaMulticore(
		corpus,
		num_topics=topic_number,
		id2word=dictionary,
		passes=passes_number,
		random_state=int_batch_number,
	)

	model_path = tmp_folder + 'lda_model_twitter_level_total_4th_sample_current_only_' + batch_number
	dictionary_path = tmp_folder + 'lda_dic_twitter_level_total_4th_sample_current_only_' + batch_number
	dump_pickle(LDA_model, model_path)
	dump_pickle(dictionary, dictionary_path)

	# NOTE(review): reloading right after dumping looks redundant; it is kept
	# so inference runs on exactly the persisted artifacts, as before.
	LDA_model = load_pickle(model_path)
	dictionary = load_pickle(dictionary_path)

	# personal level
	# (group name, tweets, skip_failures): a failure in the congress group
	# aborts the run, while the other groups skip the failing account.
	groups = (
		('congress', congress_tweet_dict, False),
		('mayor', mayor_tweet_dict, True),
		('governor', governor_tweet_dict, True),
		('soccer', soccer_tweet_dict, True),
	)
	for group_name, tweet_dict, skip_failures in groups:
		matrix, name_list = build_topic_matrix(tweet_dict, LDA_model, dictionary, skip_failures)
		dump_pickle(matrix, tmp_folder + group_name + '_topic_model_matrix_4th_sample_current_only_' + batch_number)
		dump_pickle(name_list, tmp_folder + group_name + '_name_list_4th_sample_current_only_' + batch_number)


def main(num_batches=100):
	"""Process batches ``0 .. num_batches - 1`` one after another."""
	for int_batch_number in range(num_batches):
		run_batch(int_batch_number)


if __name__ == '__main__':
	main()
